Geographic Analysis

Introduction

bla balal


import findspark
findspark.init()

from pyspark.sql import SparkSession
import pandas as pd
import plotly.express as px
import plotly.io as pio
import numpy as np

np.random.seed(42)

pio.renderers.default = "notebook"

# Initialize Spark Session
spark = SparkSession.builder.appName("LightcastData").getOrCreate()

# Load Data
df = spark.read.option("header", "true").option("inferSchema", "true").option("multiLine","true").option("escape", "\"").csv("./data/lightcast_job_postings.csv")

# Show Schema and Sample Data
print("---This is Diagnostic check, No need to print it in the final doc---")

df.printSchema() # comment this line when rendering the submission
df.show(5)
                                                                                
---This is Diagnostic check, No need to print it in the final doc---
root
 |-- ID: string (nullable = true)
 |-- LAST_UPDATED_DATE: string (nullable = true)
 |-- LAST_UPDATED_TIMESTAMP: timestamp (nullable = true)
 |-- DUPLICATES: integer (nullable = true)
 |-- POSTED: string (nullable = true)
 |-- EXPIRED: string (nullable = true)
 |-- DURATION: integer (nullable = true)
 |-- SOURCE_TYPES: string (nullable = true)
 |-- SOURCES: string (nullable = true)
 |-- URL: string (nullable = true)
 |-- ACTIVE_URLS: string (nullable = true)
 |-- ACTIVE_SOURCES_INFO: string (nullable = true)
 |-- TITLE_RAW: string (nullable = true)
 |-- BODY: string (nullable = true)
 |-- MODELED_EXPIRED: string (nullable = true)
 |-- MODELED_DURATION: integer (nullable = true)
 |-- COMPANY: integer (nullable = true)
 |-- COMPANY_NAME: string (nullable = true)
 |-- COMPANY_RAW: string (nullable = true)
 |-- COMPANY_IS_STAFFING: boolean (nullable = true)
 |-- EDUCATION_LEVELS: string (nullable = true)
 |-- EDUCATION_LEVELS_NAME: string (nullable = true)
 |-- MIN_EDULEVELS: integer (nullable = true)
 |-- MIN_EDULEVELS_NAME: string (nullable = true)
 |-- MAX_EDULEVELS: integer (nullable = true)
 |-- MAX_EDULEVELS_NAME: string (nullable = true)
 |-- EMPLOYMENT_TYPE: integer (nullable = true)
 |-- EMPLOYMENT_TYPE_NAME: string (nullable = true)
 |-- MIN_YEARS_EXPERIENCE: integer (nullable = true)
 |-- MAX_YEARS_EXPERIENCE: integer (nullable = true)
 |-- IS_INTERNSHIP: boolean (nullable = true)
 |-- SALARY: integer (nullable = true)
 |-- REMOTE_TYPE: integer (nullable = true)
 |-- REMOTE_TYPE_NAME: string (nullable = true)
 |-- ORIGINAL_PAY_PERIOD: string (nullable = true)
 |-- SALARY_TO: integer (nullable = true)
 |-- SALARY_FROM: integer (nullable = true)
 |-- LOCATION: string (nullable = true)
 |-- CITY: string (nullable = true)
 |-- CITY_NAME: string (nullable = true)
 |-- COUNTY: integer (nullable = true)
 |-- COUNTY_NAME: string (nullable = true)
 |-- MSA: integer (nullable = true)
 |-- MSA_NAME: string (nullable = true)
 |-- STATE: integer (nullable = true)
 |-- STATE_NAME: string (nullable = true)
 |-- COUNTY_OUTGOING: integer (nullable = true)
 |-- COUNTY_NAME_OUTGOING: string (nullable = true)
 |-- COUNTY_INCOMING: integer (nullable = true)
 |-- COUNTY_NAME_INCOMING: string (nullable = true)
 |-- MSA_OUTGOING: integer (nullable = true)
 |-- MSA_NAME_OUTGOING: string (nullable = true)
 |-- MSA_INCOMING: integer (nullable = true)
 |-- MSA_NAME_INCOMING: string (nullable = true)
 |-- NAICS2: integer (nullable = true)
 |-- NAICS2_NAME: string (nullable = true)
 |-- NAICS3: integer (nullable = true)
 |-- NAICS3_NAME: string (nullable = true)
 |-- NAICS4: integer (nullable = true)
 |-- NAICS4_NAME: string (nullable = true)
 |-- NAICS5: integer (nullable = true)
 |-- NAICS5_NAME: string (nullable = true)
 |-- NAICS6: integer (nullable = true)
 |-- NAICS6_NAME: string (nullable = true)
 |-- TITLE: string (nullable = true)
 |-- TITLE_NAME: string (nullable = true)
 |-- TITLE_CLEAN: string (nullable = true)
 |-- SKILLS: string (nullable = true)
 |-- SKILLS_NAME: string (nullable = true)
 |-- SPECIALIZED_SKILLS: string (nullable = true)
 |-- SPECIALIZED_SKILLS_NAME: string (nullable = true)
 |-- CERTIFICATIONS: string (nullable = true)
 |-- CERTIFICATIONS_NAME: string (nullable = true)
 |-- COMMON_SKILLS: string (nullable = true)
 |-- COMMON_SKILLS_NAME: string (nullable = true)
 |-- SOFTWARE_SKILLS: string (nullable = true)
 |-- SOFTWARE_SKILLS_NAME: string (nullable = true)
 |-- ONET: string (nullable = true)
 |-- ONET_NAME: string (nullable = true)
 |-- ONET_2019: string (nullable = true)
 |-- ONET_2019_NAME: string (nullable = true)
 |-- CIP6: string (nullable = true)
 |-- CIP6_NAME: string (nullable = true)
 |-- CIP4: string (nullable = true)
 |-- CIP4_NAME: string (nullable = true)
 |-- CIP2: string (nullable = true)
 |-- CIP2_NAME: string (nullable = true)
 |-- SOC_2021_2: string (nullable = true)
 |-- SOC_2021_2_NAME: string (nullable = true)
 |-- SOC_2021_3: string (nullable = true)
 |-- SOC_2021_3_NAME: string (nullable = true)
 |-- SOC_2021_4: string (nullable = true)
 |-- SOC_2021_4_NAME: string (nullable = true)
 |-- SOC_2021_5: string (nullable = true)
 |-- SOC_2021_5_NAME: string (nullable = true)
 |-- LOT_CAREER_AREA: integer (nullable = true)
 |-- LOT_CAREER_AREA_NAME: string (nullable = true)
 |-- LOT_OCCUPATION: integer (nullable = true)
 |-- LOT_OCCUPATION_NAME: string (nullable = true)
 |-- LOT_SPECIALIZED_OCCUPATION: integer (nullable = true)
 |-- LOT_SPECIALIZED_OCCUPATION_NAME: string (nullable = true)
 |-- LOT_OCCUPATION_GROUP: integer (nullable = true)
 |-- LOT_OCCUPATION_GROUP_NAME: string (nullable = true)
 |-- LOT_V6_SPECIALIZED_OCCUPATION: integer (nullable = true)
 |-- LOT_V6_SPECIALIZED_OCCUPATION_NAME: string (nullable = true)
 |-- LOT_V6_OCCUPATION: integer (nullable = true)
 |-- LOT_V6_OCCUPATION_NAME: string (nullable = true)
 |-- LOT_V6_OCCUPATION_GROUP: integer (nullable = true)
 |-- LOT_V6_OCCUPATION_GROUP_NAME: string (nullable = true)
 |-- LOT_V6_CAREER_AREA: integer (nullable = true)
 |-- LOT_V6_CAREER_AREA_NAME: string (nullable = true)
 |-- SOC_2: string (nullable = true)
 |-- SOC_2_NAME: string (nullable = true)
 |-- SOC_3: string (nullable = true)
 |-- SOC_3_NAME: string (nullable = true)
 |-- SOC_4: string (nullable = true)
 |-- SOC_4_NAME: string (nullable = true)
 |-- SOC_5: string (nullable = true)
 |-- SOC_5_NAME: string (nullable = true)
 |-- LIGHTCAST_SECTORS: string (nullable = true)
 |-- LIGHTCAST_SECTORS_NAME: string (nullable = true)
 |-- NAICS_2022_2: integer (nullable = true)
 |-- NAICS_2022_2_NAME: string (nullable = true)
 |-- NAICS_2022_3: integer (nullable = true)
 |-- NAICS_2022_3_NAME: string (nullable = true)
 |-- NAICS_2022_4: integer (nullable = true)
 |-- NAICS_2022_4_NAME: string (nullable = true)
 |-- NAICS_2022_5: integer (nullable = true)
 |-- NAICS_2022_5_NAME: string (nullable = true)
 |-- NAICS_2022_6: integer (nullable = true)
 |-- NAICS_2022_6_NAME: string (nullable = true)

+--------------------+-----------------+----------------------+----------+--------+---------+--------+--------------------+--------------------+--------------------+-----------+-------------------+--------------------+--------------------+---------------+----------------+--------+--------------------+-----------+-------------------+----------------+---------------------+-------------+-------------------+-------------+------------------+---------------+--------------------+--------------------+--------------------+-------------+------+-----------+----------------+-------------------+---------+-----------+--------------------+--------------------+-------------+------+--------------+-----+--------------------+-----+----------+---------------+--------------------+---------------+--------------------+------------+--------------------+------------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------------------+-------------------+--------------------+--------------------+--------------------+--------------------+-----------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+----------+--------------------+----------+---------------+----------+---------------+---------------+--------------------+--------------+--------------------+--------------------------+-------------------------------+--------------------+-------------------------+-----------------------------+----------------------------------+-----------------+----------------------+-----------------------+----------------------------+------------------+-----------------------+-------+--------------------+-------+--------------------+-------+---------------+-------+---------------+-----------------+----------------------+------------+--------------------+------------+--------------------+------------+--------------------+------------+--------------------+------------+--------------------+
|                  ID|LAST_UPDATED_DATE|LAST_UPDATED_TIMESTAMP|DUPLICATES|  POSTED|  EXPIRED|DURATION|        SOURCE_TYPES|             SOURCES|                 URL|ACTIVE_URLS|ACTIVE_SOURCES_INFO|           TITLE_RAW|                BODY|MODELED_EXPIRED|MODELED_DURATION| COMPANY|        COMPANY_NAME|COMPANY_RAW|COMPANY_IS_STAFFING|EDUCATION_LEVELS|EDUCATION_LEVELS_NAME|MIN_EDULEVELS| MIN_EDULEVELS_NAME|MAX_EDULEVELS|MAX_EDULEVELS_NAME|EMPLOYMENT_TYPE|EMPLOYMENT_TYPE_NAME|MIN_YEARS_EXPERIENCE|MAX_YEARS_EXPERIENCE|IS_INTERNSHIP|SALARY|REMOTE_TYPE|REMOTE_TYPE_NAME|ORIGINAL_PAY_PERIOD|SALARY_TO|SALARY_FROM|            LOCATION|                CITY|    CITY_NAME|COUNTY|   COUNTY_NAME|  MSA|            MSA_NAME|STATE|STATE_NAME|COUNTY_OUTGOING|COUNTY_NAME_OUTGOING|COUNTY_INCOMING|COUNTY_NAME_INCOMING|MSA_OUTGOING|   MSA_NAME_OUTGOING|MSA_INCOMING|   MSA_NAME_INCOMING|NAICS2|         NAICS2_NAME|NAICS3|         NAICS3_NAME|NAICS4|         NAICS4_NAME|NAICS5|         NAICS5_NAME|NAICS6|         NAICS6_NAME|             TITLE|         TITLE_NAME|         TITLE_CLEAN|              SKILLS|         SKILLS_NAME|  SPECIALIZED_SKILLS|SPECIALIZED_SKILLS_NAME|      CERTIFICATIONS| CERTIFICATIONS_NAME|       COMMON_SKILLS|  COMMON_SKILLS_NAME|     SOFTWARE_SKILLS|SOFTWARE_SKILLS_NAME|      ONET|           ONET_NAME| ONET_2019|      ONET_2019_NAME|                CIP6|           CIP6_NAME|                CIP4|           CIP4_NAME|                CIP2|           CIP2_NAME|SOC_2021_2|     SOC_2021_2_NAME|SOC_2021_3|     SOC_2021_3_NAME|SOC_2021_4|SOC_2021_4_NAME|SOC_2021_5|SOC_2021_5_NAME|LOT_CAREER_AREA|LOT_CAREER_AREA_NAME|LOT_OCCUPATION| LOT_OCCUPATION_NAME|LOT_SPECIALIZED_OCCUPATION|LOT_SPECIALIZED_OCCUPATION_NAME|LOT_OCCUPATION_GROUP|LOT_OCCUPATION_GROUP_NAME|LOT_V6_SPECIALIZED_OCCUPATION|LOT_V6_SPECIALIZED_OCCUPATION_NAME|LOT_V6_OCCUPATION|LOT_V6_OCCUPATION_NAME|LOT_V6_OCCUPATION_GROUP|LOT_V6_OCCUPATION_GROUP_NAME|LOT_V6_CAREER_AREA|LOT_V6_CAREER_AREA_NAME|  SOC_2|          SOC_2_NAME|  SOC_3|          SOC_3_NAME|  SOC_4|     SOC_4_NAME|  SOC_5|     SOC_5_NAME|LIGHTCAST_SECTORS|LIGHTCAST_SECTORS_NAME|NAICS_2022_2|   NAICS_2022_2_NAME|NAICS_2022_3|   NAICS_2022_3_NAME|NAICS_2022_4|   NAICS_2022_4_NAME|NAICS_2022_5|   NAICS_2022_5_NAME|NAICS_2022_6|   NAICS_2022_6_NAME|
+--------------------+-----------------+----------------------+----------+--------+---------+--------+--------------------+--------------------+--------------------+-----------+-------------------+--------------------+--------------------+---------------+----------------+--------+--------------------+-----------+-------------------+----------------+---------------------+-------------+-------------------+-------------+------------------+---------------+--------------------+--------------------+--------------------+-------------+------+-----------+----------------+-------------------+---------+-----------+--------------------+--------------------+-------------+------+--------------+-----+--------------------+-----+----------+---------------+--------------------+---------------+--------------------+------------+--------------------+------------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------------------+-------------------+--------------------+--------------------+--------------------+--------------------+-----------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+----------+--------------------+----------+---------------+----------+---------------+---------------+--------------------+--------------+--------------------+--------------------------+-------------------------------+--------------------+-------------------------+-----------------------------+----------------------------------+-----------------+----------------------+-----------------------+----------------------------+------------------+-----------------------+-------+--------------------+-------+--------------------+-------+---------------+-------+---------------+-----------------+----------------------+------------+--------------------+------------+--------------------+------------+--------------------+------------+--------------------+------------+--------------------+
|1f57d95acf4dc67ed...|         9/6/2024|  2024-09-06 20:32:...|         0|6/2/2024| 6/8/2024|       6|   [\n  "Company"\n]|[\n  "brassring.c...|[\n  "https://sjo...|         []|               NULL|Enterprise Analys...|31-May-2024\n\nEn...|       6/8/2024|               6|  894731|          Murphy USA| Murphy USA|              false|       [\n  2\n]| [\n  "Bachelor's ...|            2|  Bachelor's degree|         NULL|              NULL|              1|Full-time (> 32 h...|                   2|                   2|        false|  NULL|          0|          [None]|               NULL|     NULL|       NULL|{\n  "lat": 33.20...|RWwgRG9yYWRvLCBBUg==|El Dorado, AR|  5139|     Union, AR|20980|       El Dorado, AR|    5|  Arkansas|           5139|           Union, AR|           5139|           Union, AR|       20980|       El Dorado, AR|       20980|       El Dorado, AR|    44|        Retail Trade|   441|Motor Vehicle and...|  4413|Automotive Parts,...| 44133|Automotive Parts ...|441330|Automotive Parts ...|ET29C073C03D1F86B4|Enterprise Analysts|enterprise analys...|[\n  "KS126DB6T06...|[\n  "Merchandisi...|[\n  "KS126DB6T06...|   [\n  "Merchandisi...|                  []|                  []|[\n  "KS126706DPF...|[\n  "Mathematics...|[\n  "KS440W865GC...|[\n  "SQL (Progra...|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...|[\n  "45.0601",\n...|[\n  "Economics, ...|[\n  "45.06",\n  ...|[\n  "Economics",...|[\n  "45",\n  "27...|[\n  "Social Scie...|   15-0000|Computer and Math...|   15-2000|Mathematical Scie...|   15-2050|Data Scientists|   15-2051|Data Scientists|             23|Information Techn...|        231010|Business Intellig...|                  23101011|           General ERP Analy...|                2310|     Business Intellig...|                     23101011|              General ERP Analy...|           231010|  Business Intellig...|                   2310|        Business Intellig...|                23|   Information Techn...|15-0000|Computer and Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists|        [\n  7\n]|  [\n  "Artificial ...|          44|        Retail Trade|         441|Motor Vehicle and...|        4413|Automotive Parts,...|       44133|Automotive Parts ...|      441330|Automotive Parts ...|
|0cb072af26757b6c4...|         8/2/2024|  2024-08-02 17:08:...|         0|6/2/2024| 8/1/2024|    NULL| [\n  "Job Board"\n]| [\n  "maine.gov"\n]|[\n  "https://job...|         []|               NULL|Oracle Consultant...|Oracle Consultant...|       8/1/2024|            NULL|  133098|Smx Corporation L...|        SMX|               true|      [\n  99\n]| [\n  "No Educatio...|           99|No Education Listed|         NULL|              NULL|              1|Full-time (> 32 h...|                   3|                   3|        false|  NULL|          1|          Remote|               NULL|     NULL|       NULL|{\n  "lat": 44.31...|    QXVndXN0YSwgTUU=|  Augusta, ME| 23011|  Kennebec, ME|12300|Augusta-Watervill...|   23|     Maine|          23011|        Kennebec, ME|          23011|        Kennebec, ME|       12300|Augusta-Watervill...|       12300|Augusta-Watervill...|    56|Administrative an...|   561|Administrative an...|  5613| Employment Services| 56132|Temporary Help Se...|561320|Temporary Help Se...|ET21DDA63780A7DC09| Oracle Consultants|oracle consultant...|[\n  "KS122626T55...|[\n  "Procurement...|[\n  "KS122626T55...|   [\n  "Procurement...|                  []|                  []|                  []|                  []|[\n  "BGSBF3F508F...|[\n  "Oracle Busi...|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...|                  []|                  []|                  []|                  []|                  []|                  []|   15-0000|Computer and Math...|   15-2000|Mathematical Scie...|   15-2050|Data Scientists|   15-2051|Data Scientists|             23|Information Techn...|        231010|Business Intellig...|                  23101012|           Oracle Consultant...|                2310|     Business Intellig...|                     23101012|              Oracle Consultant...|           231010|  Business Intellig...|                   2310|        Business Intellig...|                23|   Information Techn...|15-0000|Computer and Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists|             NULL|                  NULL|          56|Administrative an...|         561|Administrative an...|        5613| Employment Services|       56132|Temporary Help Se...|      561320|Temporary Help Se...|
|85318b12b3331fa49...|         9/6/2024|  2024-09-06 20:32:...|         1|6/2/2024| 7/7/2024|      35| [\n  "Job Board"\n]|[\n  "dejobs.org"\n]|[\n  "https://dej...|         []|               NULL|        Data Analyst|Taking care of pe...|      6/10/2024|               8|39063746|            Sedgwick|   Sedgwick|              false|       [\n  2\n]| [\n  "Bachelor's ...|            2|  Bachelor's degree|         NULL|              NULL|              1|Full-time (> 32 h...|                   5|                NULL|        false|  NULL|          0|          [None]|               NULL|     NULL|       NULL|{\n  "lat": 32.77...|    RGFsbGFzLCBUWA==|   Dallas, TX| 48113|    Dallas, TX|19100|Dallas-Fort Worth...|   48|     Texas|          48113|          Dallas, TX|          48113|          Dallas, TX|       19100|Dallas-Fort Worth...|       19100|Dallas-Fort Worth...|    52|Finance and Insur...|   524|Insurance Carrier...|  5242|Agencies, Brokera...| 52429|Other Insurance R...|524291|    Claims Adjusting|ET3037E0C947A02404|      Data Analysts|        data analyst|[\n  "KS1218W78FG...|[\n  "Management"...|[\n  "ESF3939CE1F...|   [\n  "Exception R...|[\n  "KS683TN76T7...|[\n  "Security Cl...|[\n  "KS1218W78FG...|[\n  "Management"...|[\n  "KS126HY6YLT...|[\n  "Microsoft O...|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...|                  []|                  []|                  []|                  []|                  []|                  []|   15-0000|Computer and Math...|   15-2000|Mathematical Scie...|   15-2050|Data Scientists|   15-2051|Data Scientists|             23|Information Techn...|        231113|Data / Data Minin...|                  23111310|                   Data Analyst|                2311|     Data Analysis and...|                     23111310|                      Data Analyst|           231113|  Data / Data Minin...|                   2311|        Data Analysis and...|                23|   Information Techn...|15-0000|Computer and Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists|             NULL|                  NULL|          52|Finance and Insur...|         524|Insurance Carrier...|        5242|Agencies, Brokera...|       52429|Other Insurance R...|      524291|    Claims Adjusting|
|1b5c3941e54a1889e...|         9/6/2024|  2024-09-06 20:32:...|         1|6/2/2024|7/20/2024|      48| [\n  "Job Board"\n]|[\n  "disabledper...|[\n  "https://www...|         []|               NULL|Sr. Lead Data Mgm...|About this role:\...|      6/12/2024|              10|37615159|         Wells Fargo|Wells Fargo|              false|      [\n  99\n]| [\n  "No Educatio...|           99|No Education Listed|         NULL|              NULL|              1|Full-time (> 32 h...|                   3|                NULL|        false|  NULL|          0|          [None]|               NULL|     NULL|       NULL|{\n  "lat": 33.44...|    UGhvZW5peCwgQVo=|  Phoenix, AZ|  4013|  Maricopa, AZ|38060|Phoenix-Mesa-Chan...|    4|   Arizona|           4013|        Maricopa, AZ|           4013|        Maricopa, AZ|       38060|Phoenix-Mesa-Chan...|       38060|Phoenix-Mesa-Chan...|    52|Finance and Insur...|   522|Credit Intermedia...|  5221|Depository Credit...| 52211|  Commercial Banking|522110|  Commercial Banking|ET2114E0404BA30075|Management Analysts|sr lead data mgmt...|[\n  "KS123QX62QY...|[\n  "Exit Strate...|[\n  "KS123QX62QY...|   [\n  "Exit Strate...|                  []|                  []|[\n  "KS7G6NP6R6L...|[\n  "Reliability...|[\n  "KS4409D76NW...|[\n  "SAS (Softwa...|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...|                  []|                  []|                  []|                  []|                  []|                  []|   15-0000|Computer and Math...|   15-2000|Mathematical Scie...|   15-2050|Data Scientists|   15-2051|Data Scientists|             23|Information Techn...|        231113|Data / Data Minin...|                  23111310|                   Data Analyst|                2311|     Data Analysis and...|                     23111310|                      Data Analyst|           231113|  Data / Data Minin...|                   2311|        Data Analysis and...|                23|   Information Techn...|15-0000|Computer and Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists|        [\n  6\n]|  [\n  "Data Privac...|          52|Finance and Insur...|         522|Credit Intermedia...|        5221|Depository Credit...|       52211|  Commercial Banking|      522110|  Commercial Banking|
|cb5ca25f02bdf25c1...|        6/19/2024|   2024-06-19 07:00:00|         0|6/2/2024|6/17/2024|      15|[\n  "FreeJobBoar...|[\n  "craigslist....|[\n  "https://mod...|         []|               NULL|Comisiones de $10...|Comisiones de $10...|      6/17/2024|              15|       0|        Unclassified|      LH/GM|              false|      [\n  99\n]| [\n  "No Educatio...|           99|No Education Listed|         NULL|              NULL|              3|Part-time / full-...|                NULL|                NULL|        false| 92500|          0|          [None]|               year|   150000|      35000|{\n  "lat": 37.63...|    TW9kZXN0bywgQ0E=|  Modesto, CA|  6099|Stanislaus, CA|33700|         Modesto, CA|    6|California|           6099|      Stanislaus, CA|           6099|      Stanislaus, CA|       33700|         Modesto, CA|       33700|         Modesto, CA|    99|Unclassified Indu...|   999|Unclassified Indu...|  9999|Unclassified Indu...| 99999|Unclassified Indu...|999999|Unclassified Indu...|ET0000000000000000|       Unclassified|comisiones de por...|                  []|                  []|                  []|                     []|                  []|                  []|                  []|                  []|                  []|                  []|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...|                  []|                  []|                  []|                  []|                  []|                  []|   15-0000|Computer and Math...|   15-2000|Mathematical Scie...|   15-2050|Data Scientists|   15-2051|Data Scientists|             23|Information Techn...|        231010|Business Intellig...|                  23101012|           Oracle Consultant...|                2310|     Business Intellig...|                     23101012|              Oracle Consultant...|           231010|  Business Intellig...|                   2310|        Business Intellig...|                23|   Information Techn...|15-0000|Computer and Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists|             NULL|                  NULL|          99|Unclassified Indu...|         999|Unclassified Indu...|        9999|Unclassified Indu...|       99999|Unclassified Indu...|      999999|Unclassified Indu...|
+--------------------+-----------------+----------------------+----------+--------+---------+--------+--------------------+--------------------+--------------------+-----------+-------------------+--------------------+--------------------+---------------+----------------+--------+--------------------+-----------+-------------------+----------------+---------------------+-------------+-------------------+-------------+------------------+---------------+--------------------+--------------------+--------------------+-------------+------+-----------+----------------+-------------------+---------+-----------+--------------------+--------------------+-------------+------+--------------+-----+--------------------+-----+----------+---------------+--------------------+---------------+--------------------+------------+--------------------+------------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------------------+-------------------+--------------------+--------------------+--------------------+--------------------+-----------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+----------+--------------------+----------+---------------+----------+---------------+---------------+--------------------+--------------+--------------------+--------------------------+-------------------------------+--------------------+-------------------------+-----------------------------+----------------------------------+-----------------+----------------------+-----------------------+----------------------------+------------------+-----------------------+-------+--------------------+-------+--------------------+-------+---------------+-------+---------------+-----------------+----------------------+------------+--------------------+------------+--------------------+------------+--------------------+------------+--------------------+------------+--------------------+
only showing top 5 rows
## Listing Columns So We Can Reference them in Visuals

import pandas as pd
df = pd.read_csv("./data/lightcast_job_postings.csv")
print(df.columns.tolist())
/tmp/ipykernel_2561/3265774581.py:4: DtypeWarning:

Columns (19,30) have mixed types. Specify dtype option on import or set low_memory=False.
['ID', 'LAST_UPDATED_DATE', 'LAST_UPDATED_TIMESTAMP', 'DUPLICATES', 'POSTED', 'EXPIRED', 'DURATION', 'SOURCE_TYPES', 'SOURCES', 'URL', 'ACTIVE_URLS', 'ACTIVE_SOURCES_INFO', 'TITLE_RAW', 'BODY', 'MODELED_EXPIRED', 'MODELED_DURATION', 'COMPANY', 'COMPANY_NAME', 'COMPANY_RAW', 'COMPANY_IS_STAFFING', 'EDUCATION_LEVELS', 'EDUCATION_LEVELS_NAME', 'MIN_EDULEVELS', 'MIN_EDULEVELS_NAME', 'MAX_EDULEVELS', 'MAX_EDULEVELS_NAME', 'EMPLOYMENT_TYPE', 'EMPLOYMENT_TYPE_NAME', 'MIN_YEARS_EXPERIENCE', 'MAX_YEARS_EXPERIENCE', 'IS_INTERNSHIP', 'SALARY', 'REMOTE_TYPE', 'REMOTE_TYPE_NAME', 'ORIGINAL_PAY_PERIOD', 'SALARY_TO', 'SALARY_FROM', 'LOCATION', 'CITY', 'CITY_NAME', 'COUNTY', 'COUNTY_NAME', 'MSA', 'MSA_NAME', 'STATE', 'STATE_NAME', 'COUNTY_OUTGOING', 'COUNTY_NAME_OUTGOING', 'COUNTY_INCOMING', 'COUNTY_NAME_INCOMING', 'MSA_OUTGOING', 'MSA_NAME_OUTGOING', 'MSA_INCOMING', 'MSA_NAME_INCOMING', 'NAICS2', 'NAICS2_NAME', 'NAICS3', 'NAICS3_NAME', 'NAICS4', 'NAICS4_NAME', 'NAICS5', 'NAICS5_NAME', 'NAICS6', 'NAICS6_NAME', 'TITLE', 'TITLE_NAME', 'TITLE_CLEAN', 'SKILLS', 'SKILLS_NAME', 'SPECIALIZED_SKILLS', 'SPECIALIZED_SKILLS_NAME', 'CERTIFICATIONS', 'CERTIFICATIONS_NAME', 'COMMON_SKILLS', 'COMMON_SKILLS_NAME', 'SOFTWARE_SKILLS', 'SOFTWARE_SKILLS_NAME', 'ONET', 'ONET_NAME', 'ONET_2019', 'ONET_2019_NAME', 'CIP6', 'CIP6_NAME', 'CIP4', 'CIP4_NAME', 'CIP2', 'CIP2_NAME', 'SOC_2021_2', 'SOC_2021_2_NAME', 'SOC_2021_3', 'SOC_2021_3_NAME', 'SOC_2021_4', 'SOC_2021_4_NAME', 'SOC_2021_5', 'SOC_2021_5_NAME', 'LOT_CAREER_AREA', 'LOT_CAREER_AREA_NAME', 'LOT_OCCUPATION', 'LOT_OCCUPATION_NAME', 'LOT_SPECIALIZED_OCCUPATION', 'LOT_SPECIALIZED_OCCUPATION_NAME', 'LOT_OCCUPATION_GROUP', 'LOT_OCCUPATION_GROUP_NAME', 'LOT_V6_SPECIALIZED_OCCUPATION', 'LOT_V6_SPECIALIZED_OCCUPATION_NAME', 'LOT_V6_OCCUPATION', 'LOT_V6_OCCUPATION_NAME', 'LOT_V6_OCCUPATION_GROUP', 'LOT_V6_OCCUPATION_GROUP_NAME', 'LOT_V6_CAREER_AREA', 'LOT_V6_CAREER_AREA_NAME', 'SOC_2', 'SOC_2_NAME', 'SOC_3', 'SOC_3_NAME', 'SOC_4', 'SOC_4_NAME', 'SOC_5', 'SOC_5_NAME', 'LIGHTCAST_SECTORS', 'LIGHTCAST_SECTORS_NAME', 'NAICS_2022_2', 'NAICS_2022_2_NAME', 'NAICS_2022_3', 'NAICS_2022_3_NAME', 'NAICS_2022_4', 'NAICS_2022_4_NAME', 'NAICS_2022_5', 'NAICS_2022_5_NAME', 'NAICS_2022_6', 'NAICS_2022_6_NAME']
import pandas as pd

df = pd.read_csv("./data/lightcast_job_postings.csv")

columns_to_drop = [
    "ID", "URL", "ACTIVE_URLS", "DUPLICATES", "LAST_UPDATED_TIMESTAMP",
    "NAICS2", "NAICS3", "NAICS4", "NAICS5", "NAICS6",
    "SOC_2", "SOC_3", "SOC_5"
]

df.drop(columns=columns_to_drop, inplace=True)
/tmp/ipykernel_2561/304705447.py:3: DtypeWarning:

Columns (19,30) have mixed types. Specify dtype option on import or set low_memory=False.
print(df.columns.tolist())
['ID', 'LAST_UPDATED_DATE', 'LAST_UPDATED_TIMESTAMP', 'DUPLICATES', 'POSTED', 'EXPIRED', 'DURATION', 'SOURCE_TYPES', 'SOURCES', 'URL', 'ACTIVE_URLS', 'ACTIVE_SOURCES_INFO', 'TITLE_RAW', 'BODY', 'MODELED_EXPIRED', 'MODELED_DURATION', 'COMPANY', 'COMPANY_NAME', 'COMPANY_RAW', 'COMPANY_IS_STAFFING', 'EDUCATION_LEVELS', 'EDUCATION_LEVELS_NAME', 'MIN_EDULEVELS', 'MIN_EDULEVELS_NAME', 'MAX_EDULEVELS', 'MAX_EDULEVELS_NAME', 'EMPLOYMENT_TYPE', 'EMPLOYMENT_TYPE_NAME', 'MIN_YEARS_EXPERIENCE', 'MAX_YEARS_EXPERIENCE', 'IS_INTERNSHIP', 'SALARY', 'REMOTE_TYPE', 'REMOTE_TYPE_NAME', 'ORIGINAL_PAY_PERIOD', 'SALARY_TO', 'SALARY_FROM', 'LOCATION', 'CITY', 'CITY_NAME', 'COUNTY', 'COUNTY_NAME', 'MSA', 'MSA_NAME', 'STATE', 'STATE_NAME', 'COUNTY_OUTGOING', 'COUNTY_NAME_OUTGOING', 'COUNTY_INCOMING', 'COUNTY_NAME_INCOMING', 'MSA_OUTGOING', 'MSA_NAME_OUTGOING', 'MSA_INCOMING', 'MSA_NAME_INCOMING', 'NAICS2', 'NAICS2_NAME', 'NAICS3', 'NAICS3_NAME', 'NAICS4', 'NAICS4_NAME', 'NAICS5', 'NAICS5_NAME', 'NAICS6', 'NAICS6_NAME', 'TITLE', 'TITLE_NAME', 'TITLE_CLEAN', 'SKILLS', 'SKILLS_NAME', 'SPECIALIZED_SKILLS', 'SPECIALIZED_SKILLS_NAME', 'CERTIFICATIONS', 'CERTIFICATIONS_NAME', 'COMMON_SKILLS', 'COMMON_SKILLS_NAME', 'SOFTWARE_SKILLS', 'SOFTWARE_SKILLS_NAME', 'ONET', 'ONET_NAME', 'ONET_2019', 'ONET_2019_NAME', 'CIP6', 'CIP6_NAME', 'CIP4', 'CIP4_NAME', 'CIP2', 'CIP2_NAME', 'SOC_2021_2', 'SOC_2021_2_NAME', 'SOC_2021_3', 'SOC_2021_3_NAME', 'SOC_2021_4', 'SOC_2021_4_NAME', 'SOC_2021_5', 'SOC_2021_5_NAME', 'LOT_CAREER_AREA', 'LOT_CAREER_AREA_NAME', 'LOT_OCCUPATION', 'LOT_OCCUPATION_NAME', 'LOT_SPECIALIZED_OCCUPATION', 'LOT_SPECIALIZED_OCCUPATION_NAME', 'LOT_OCCUPATION_GROUP', 'LOT_OCCUPATION_GROUP_NAME', 'LOT_V6_SPECIALIZED_OCCUPATION', 'LOT_V6_SPECIALIZED_OCCUPATION_NAME', 'LOT_V6_OCCUPATION', 'LOT_V6_OCCUPATION_NAME', 'LOT_V6_OCCUPATION_GROUP', 'LOT_V6_OCCUPATION_GROUP_NAME', 'LOT_V6_CAREER_AREA', 'LOT_V6_CAREER_AREA_NAME', 'SOC_2', 'SOC_2_NAME', 'SOC_3', 'SOC_3_NAME', 'SOC_4', 'SOC_4_NAME', 'SOC_5', 'SOC_5_NAME', 'LIGHTCAST_SECTORS', 'LIGHTCAST_SECTORS_NAME', 'NAICS_2022_2', 'NAICS_2022_2_NAME', 'NAICS_2022_3', 'NAICS_2022_3_NAME', 'NAICS_2022_4', 'NAICS_2022_4_NAME', 'NAICS_2022_5', 'NAICS_2022_5_NAME', 'NAICS_2022_6', 'NAICS_2022_6_NAME']
!pip install missingno
Requirement already satisfied: missingno in ./.venv/lib/python3.12/site-packages (0.5.2)
Requirement already satisfied: numpy in ./.venv/lib/python3.12/site-packages (from missingno) (2.2.6)
Requirement already satisfied: matplotlib in ./.venv/lib/python3.12/site-packages (from missingno) (3.10.3)
Requirement already satisfied: scipy in ./.venv/lib/python3.12/site-packages (from missingno) (1.15.3)
Requirement already satisfied: seaborn in ./.venv/lib/python3.12/site-packages (from missingno) (0.13.2)
Requirement already satisfied: contourpy>=1.0.1 in ./.venv/lib/python3.12/site-packages (from matplotlib->missingno) (1.3.2)
Requirement already satisfied: cycler>=0.10 in ./.venv/lib/python3.12/site-packages (from matplotlib->missingno) (0.12.1)
Requirement already satisfied: fonttools>=4.22.0 in ./.venv/lib/python3.12/site-packages (from matplotlib->missingno) (4.58.0)
Requirement already satisfied: kiwisolver>=1.3.1 in ./.venv/lib/python3.12/site-packages (from matplotlib->missingno) (1.4.8)
Requirement already satisfied: packaging>=20.0 in ./.venv/lib/python3.12/site-packages (from matplotlib->missingno) (25.0)
Requirement already satisfied: pillow>=8 in ./.venv/lib/python3.12/site-packages (from matplotlib->missingno) (11.2.1)
Requirement already satisfied: pyparsing>=2.3.1 in ./.venv/lib/python3.12/site-packages (from matplotlib->missingno) (3.2.3)
Requirement already satisfied: python-dateutil>=2.7 in ./.venv/lib/python3.12/site-packages (from matplotlib->missingno) (2.9.0.post0)
Requirement already satisfied: pandas>=1.2 in ./.venv/lib/python3.12/site-packages (from seaborn->missingno) (2.2.3)
Requirement already satisfied: pytz>=2020.1 in ./.venv/lib/python3.12/site-packages (from pandas>=1.2->seaborn->missingno) (2025.2)
Requirement already satisfied: tzdata>=2022.7 in ./.venv/lib/python3.12/site-packages (from pandas>=1.2->seaborn->missingno) (2025.2)
Requirement already satisfied: six>=1.5 in ./.venv/lib/python3.12/site-packages (from python-dateutil>=2.7->matplotlib->missingno) (1.17.0)
import missingno as msno
import matplotlib.pyplot as plt
# Visualize missing values
msno.heatmap(df)
plt.title("Missing Values Heatmap")
plt.show()

# Drop columns with >50% missing values
df.dropna(thresh=len(df) * 0.5, axis=1, inplace=True)

# Fill only the columns you actually have
if 'Industry' in df.columns:
    df["Industry"].fillna("Unknown", inplace=True)
    df["Salary"].fillna(df["Salary"].median(), inplace=True)

df = df.drop_duplicates(subset=["TITLE", "COMPANY", "LOCATION", "POSTED"], keep="first")
df = df[df['NAICS_2022_2_NAME'] != 'Unclassified Industry']
df['REMOTE_TYPE_NAME'] = df['REMOTE_TYPE_NAME'].replace('[None]', 'Unknown')

import pandas as pd
import plotly.express as px

# Sample data setup (replace this with your actual dataframe)
data = {
    'Industry': [
        'Wholesale Trade', 'Retail Trade', 'Real Estate and Rental and Leasing',
        'Professional, Scientific, and Technical Services', 'Manufacturing',
        'Information', 'Health Care and Social Assistance',
        'Finance and Insurance', 'Educational Services',
        'Administrative and Support and Waste Management and Remediation Services'
    ],
    'Flexible Remote': [87.8, 94.4, 97.6, 92.2, 89.7, 95.8, 92.1, 94.8, 89.0, 94.8],
    'Onsite': [12.2, 5.6, 2.4, 7.8, 10.3, 4.2, 7.9, 5.2, 11.0, 5.2]
}

df = pd.DataFrame(data)

# Melt the data for Plotly
df_melted = df.melt(id_vars='Industry', value_vars=['Flexible Remote', 'Onsite'],
                    var_name='Remote Type', value_name='Percentage')

# Sort industries by Flexible Remote % descending
df['SortKey'] = df['Flexible Remote']
df = df.sort_values(by='SortKey', ascending=False)
df_melted['Industry'] = pd.Categorical(df_melted['Industry'], categories=df['Industry'], ordered=True)

# Plot
fig = px.bar(
    df_melted,
    x="Percentage",
    y="Industry",
    color="Remote Type",
    orientation="h",
    text="Percentage",
    color_discrete_map={"Flexible Remote": "#636EFA", "Onsite": "#EF553B"},
    title="Remote Job Distribution by Industry (Top 10 Industries)"
)

# Clean layout, shift bars left
fig.update_layout(
    xaxis_title="Percentage of Jobs",
    yaxis_title="",
    xaxis=dict(tickformat=".0f"),
    legend_title="Remote Type",
    barmode='stack',
    margin=dict(l=10, r=10, t=60, b=40),  # Reduce margins
    height=500
)

# Format labels
fig.update_traces(texttemplate='%{text:.1f}%', textposition='inside')

fig.show()






import pandas as pd
from IPython.display import display
# 1. Load the dataset and parse dates
df = pd.read_csv("./data/lightcast_job_postings.csv", parse_dates=["POSTED", "EXPIRED"])
# Calculate DURATION in days
df['DURATION'] = (df['EXPIRED'] - df['POSTED']).dt.days
# Extract POSTED month
df['POSTED_MONTH'] = df['POSTED'].dt.to_period("M")
/tmp/ipykernel_2561/784347523.py:4: DtypeWarning:

Columns (19,30) have mixed types. Specify dtype option on import or set low_memory=False.
# Define AI and non-AI impacted NAICS codes
ai_impacted_naics = [31, 32, 33, 42, 44, 45, 51, 52, 54, 55, 56]
non_ai_impacted_naics = [11, 21, 22, 23, 48, 49, 61, 62, 71, 72]
# Create IS_AI_JOB flag
df['IS_AI_JOB'] = df['NAICS_2022_2'].apply(lambda x: 1 if pd.notna(x) and x in ai_impacted_naics else 0)

# Remove rows with missing STATE_NAME or POSTED_MONTH
df = df[df['STATE_NAME'].notna() & df['POSTED_MONTH'].notna()]

# Group by POSTED_MONTH, STATE_NAME, IS_AI_JOB to count jobs
job_counts = (
    df.groupby(['POSTED_MONTH', 'STATE_NAME', 'IS_AI_JOB'])
      .size()
      .reset_index(name='JOB_COUNT')
)
# Pivot to put months as rows, states as columns
pivot_df = job_counts.pivot_table(index=['STATE_NAME', 'IS_AI_JOB'], 
                                   columns='POSTED_MONTH', 
                                   values='JOB_COUNT', 
                                   fill_value=0)

# Calculate % growth from first to last month
first_month = pivot_df.columns[0]
last_month = pivot_df.columns[-1]
pivot_df['PCT_GROWTH'] = ((pivot_df[last_month] - pivot_df[first_month]) / 
                          pivot_df[first_month].replace(0, pd.NA)) * 100

# Drop rows with undefined % growth (divide by zero)
pivot_df = pivot_df.dropna(subset=['PCT_GROWTH'])

# Reset index for sorting
pivot_df = pivot_df.reset_index()
# Get top 10 states by % growth
top_ai = pivot_df[pivot_df['IS_AI_JOB'] == 1].sort_values('PCT_GROWTH', ascending=False).head(10)
top_non_ai = pivot_df[pivot_df['IS_AI_JOB'] == 0].sort_values('PCT_GROWTH', ascending=False).head(10)

# Combine and label
top_combined = pd.concat([
    top_ai.assign(JOB_TYPE='AI'),
    top_non_ai.assign(JOB_TYPE='Non-AI')
])

# Display 
display(top_combined[['STATE_NAME', 'IS_AI_JOB', 'PCT_GROWTH', 'JOB_TYPE']])
POSTED_MONTH STATE_NAME IS_AI_JOB PCT_GROWTH JOB_TYPE
101 Wyoming 1 380.000000 AI
97 West Virginia 1 380.000000 AI
81 South Dakota 1 175.000000 AI
21 Hawaii 1 75.000000 AI
67 North Dakota 1 64.285714 AI
61 New Mexico 1 56.000000 AI
73 Oregon 1 54.716981 AI
41 Massachusetts 1 40.074906 AI
95 Washington, D.C. (District of Columbia) 1 36.363636 AI
53 Nebraska 1 36.065574 AI
66 North Dakota 0 275.000000 Non-AI
88 Vermont 0 225.000000 Non-AI
96 West Virginia 0 130.000000 Non-AI
52 Nebraska 0 100.000000 Non-AI
72 Oregon 0 100.000000 Non-AI
2 Alaska 0 92.307692 Non-AI
32 Kentucky 0 53.125000 Non-AI
94 Washington, D.C. (District of Columbia) 0 40.000000 Non-AI
48 Missouri 0 37.931034 Non-AI
60 New Mexico 0 36.842105 Non-AI
def label_ai_impact(naics):
    try:
        code = int(naics)
        if code in ai_impacted_naics:
            return "AI-Impacted"
        elif code in non_ai_impacted_naics:
            return "Non-AI-Impacted"
        else:
            return "Unclassified"
    except:
        return "Unclassified"
df['AI_IMPACTED'] = df['NAICS_2022_2'].apply(label_ai_impact)
# Filter only AI and Non-AI impacted
df = df[df["AI_IMPACTED"].isin(["AI-Impacted", "Non-AI-Impacted"])]

# Now split the data
ai_df = df[df["AI_IMPACTED"] == "AI-Impacted"][
    ["STATE_NAME", "NAICS_2022_2", "NAICS_2022_2_NAME", "LOT_SPECIALIZED_OCCUPATION_NAME"]
]

non_ai_df = df[df["AI_IMPACTED"] == "Non-AI-Impacted"][
    ["STATE_NAME", "NAICS_2022_2", "NAICS_2022_2_NAME", "LOT_SPECIALIZED_OCCUPATION_NAME"]
]
from IPython.display import display

print("AI-Impacted Industries:")
display(ai_df)

print("\nNon-AI-Impacted Industries:")
display(non_ai_df)
AI-Impacted Industries:
STATE_NAME NAICS_2022_2 NAICS_2022_2_NAME LOT_SPECIALIZED_OCCUPATION_NAME
0 Arkansas 44.0 Retail Trade General ERP Analyst / Consultant
1 Maine 56.0 Administrative and Support and Waste Managemen... Oracle Consultant / Analyst
2 Texas 52.0 Finance and Insurance Data Analyst
3 Arizona 52.0 Finance and Insurance Data Analyst
5 Arkansas 51.0 Information Data Analyst
... ... ... ... ...
72493 Virginia 54.0 Professional, Scientific, and Technical Services Data Analyst
72494 Massachusetts 51.0 Information Enterprise Architect
72495 Michigan 56.0 Administrative and Support and Waste Managemen... Data Analyst
72496 Maine 51.0 Information Data Analyst
72497 Texas 54.0 Professional, Scientific, and Technical Services Oracle Consultant / Analyst

49905 rows × 4 columns


Non-AI-Impacted Industries:
STATE_NAME NAICS_2022_2 NAICS_2022_2_NAME LOT_SPECIALIZED_OCCUPATION_NAME
15 Massachusetts 61.0 Educational Services Data Analyst
18 Alabama 62.0 Health Care and Social Assistance Enterprise Architect
37 Virginia 62.0 Health Care and Social Assistance Data Analyst
77 Ohio 23.0 Construction Enterprise Architect
94 Texas 61.0 Educational Services Business Analyst (General)
... ... ... ... ...
72450 Oklahoma 48.0 Transportation and Warehousing Financial Data Analyst
72451 South Carolina 62.0 Health Care and Social Assistance Data Analyst
72460 California 11.0 Agriculture, Forestry, Fishing and Hunting Data Analyst
72479 Wyoming 61.0 Educational Services Data Analyst
72489 Texas 22.0 Utilities SAP Analyst / Admin

7627 rows × 4 columns

# Step 1: Count total jobs by state
total_jobs_by_state = df.groupby('STATE_NAME').size().rename('Total_Jobs')
# Step 2: Count AI-impacted jobs by state
ai_jobs_by_state = df[df['AI_IMPACTED'] == 'AI-Impacted'].groupby('STATE_NAME').size().rename('AI_Impacted_Jobs')
# Step 3: Merge the two counts into a single DataFrame
ai_impact_summary = pd.concat([total_jobs_by_state, ai_jobs_by_state], axis=1).fillna(0)
# Step 4: Calculate percentage of AI-impacted jobs
ai_impact_summary['AI_Impact_Percentage'] = (ai_impact_summary['AI_Impacted_Jobs'] / ai_impact_summary['Total_Jobs']) * 100
# Step 5: Sort states by highest percentage
ai_impact_summary_sorted = ai_impact_summary.sort_values(by='AI_Impact_Percentage', ascending=False)
# Step 6: Display the top 10 states (or all if you prefer)
from IPython.display import display
display(ai_impact_summary_sorted.head(10))
Total_Jobs AI_Impacted_Jobs AI_Impact_Percentage
STATE_NAME
New Jersey 2156 1976 91.651206
Vermont 201 183 91.044776
Idaho 399 361 90.476190
North Carolina 2225 2013 90.471910
Connecticut 723 654 90.456432
Hawaii 183 165 90.163934
Washington, D.C. (District of Columbia) 933 839 89.924973
Wisconsin 852 766 89.906103
Illinois 2802 2515 89.757316
Virginia 2919 2619 89.722508
# Step 1: Clean and convert date columns
df = df[df['NAICS_2022_2_NAME'] != 'Unclassified Industry']
df['POSTED'] = pd.to_datetime(df['POSTED'], errors='coerce')
df['EXPIRED'] = pd.to_datetime(df['EXPIRED'], errors='coerce')

# Drop rows with missing POSTED or EXPIRED dates
df = df.dropna(subset=['POSTED', 'EXPIRED'])
import plotly.express as px

# Count postings per industry
industry_counts = df['NAICS2_NAME'].value_counts().reset_index()
industry_counts.columns = ['NAICS2_NAME', 'count']

# Sort values for better readability
industry_counts = industry_counts.sort_values(by='count', ascending=True)

# Horizontal bar plot
fig = px.bar(
    industry_counts,
    x='count',
    y='NAICS2_NAME',
    orientation='h',
    title='Job Postings by Industry',
    labels={'NAICS2_NAME': 'Industry', 'count': 'Number of Postings'},
    color='count',
    color_continuous_scale='Blues'
)

# Clean layout
fig.update_layout(
    yaxis_title='Industry',
    xaxis_title='Number of Postings',
    title_font_size=20,
    plot_bgcolor='white',
    xaxis=dict(showgrid=True),
    yaxis=dict(showgrid=False)
)

fig.show()
# Extract month from POSTED and EXPIRED
df['POSTED_MONTH'] = df['POSTED'].dt.to_period('M').astype(str)
df['EXPIRED_MONTH'] = df['EXPIRED'].dt.to_period('M').astype(str)
# Step 2: Define AI and non-AI industry codes using NAICS 2-digit level
ai_impacted_naics = [31, 32, 33, 42, 44, 45, 51, 52, 54, 55, 56]
non_ai_impacted_naics = [11, 21, 22, 23, 48, 49, 61, 62, 71, 72]
def label_ai(naics):
    try:
        code = int(naics)
        if code in ai_naics:
            return "AI-Impacted"
        elif code in non_ai_naics:
            return "Non-AI-Impacted"
        else:
            return "Unclassified"
    except:
        return "Unclassified"
    df['AI_IMPACTED'] = df['NAICS_2022_2'].apply(label_ai)
    df = df[df['AI_IMPACTED'] != 'Unclassified']
# Extract posting month
df['POSTED_MONTH'] = df['POSTED'].dt.to_period('M')

# Count jobs by month, state, and AI impact
monthly_counts = (
    df.groupby(['STATE_NAME', 'POSTED_MONTH', 'AI_IMPACTED'])
    .size()
    .reset_index(name='Job_Count')
)

# Pivot table to split AI vs Non-AI job counts
pivoted = monthly_counts.pivot_table(
    index=['STATE_NAME', 'POSTED_MONTH'],
    columns='AI_IMPACTED',
    values='Job_Count',
    fill_value=0
).reset_index()
# Sort by month for each state
pivoted.sort_values(by=['STATE_NAME', 'POSTED_MONTH'], inplace=True)

# Step 6: Sort and calculate percent growth
pivoted.sort_values(['STATE_NAME', 'POSTED_MONTH'], inplace=True)
pivoted['AI_Growth'] = pivoted.groupby('STATE_NAME')['AI-Impacted'].pct_change() * 100
pivoted['Non_AI_Growth'] = pivoted.groupby('STATE_NAME')['Non-AI-Impacted'].pct_change() * 100

# Drop rows with NaN (typically the first month per state)
pivoted.dropna(subset=['AI_Growth', 'Non_AI_Growth'], inplace=True)


# Step 7: Average monthly growth by state
average_growth = pivoted.groupby('STATE_NAME')[['AI_Growth', 'Non_AI_Growth']].mean().reset_index()
print(average_growth)
AI_IMPACTED                               STATE_NAME   AI_Growth  \
0                                            Alabama   -1.030391   
1                                             Alaska    0.474691   
2                                            Arizona   -0.396808   
3                                           Arkansas   -6.340944   
4                                         California   -7.115259   
5                                           Colorado   -3.534986   
6                                        Connecticut  -10.133298   
7                                           Delaware  -10.990546   
8                                            Florida   -9.705992   
9                                            Georgia  -15.220526   
10                                            Hawaii   70.988394   
11                                             Idaho  -16.914886   
12                                          Illinois  -17.089139   
13                                           Indiana   -0.969177   
14                                              Iowa   -9.540767   
15                                            Kansas   -6.636718   
16                                          Kentucky   10.591566   
17                                         Louisiana    8.497502   
18                                             Maine   -9.323389   
19                                          Maryland  -14.177059   
20                                     Massachusetts   -3.369591   
21                                          Michigan  -13.326328   
22                                         Minnesota   -5.627136   
23                                       Mississippi   28.334651   
24                                          Missouri  -14.850411   
25                                           Montana   25.160256   
26                                          Nebraska   10.707044   
27                                            Nevada  -10.497371   
28                                     New Hampshire    4.255144   
29                                        New Jersey  -11.443245   
30                                        New Mexico   10.431647   
31                                          New York  -10.012403   
32                                    North Carolina   -8.164973   
33                                      North Dakota   12.428143   
34                                              Ohio   -8.703213   
35                                          Oklahoma    3.762787   
36                                            Oregon    5.138959   
37                                      Pennsylvania   -9.312598   
38                                      Rhode Island    3.863502   
39                                    South Carolina   -4.634607   
40                                      South Dakota   30.021078   
41                                         Tennessee  -10.719712   
42                                             Texas   -9.760556   
43                                              Utah   10.119048   
44                                           Vermont    5.855271   
45                                          Virginia   -9.747373   
46                                        Washington   -7.818786   
47           Washington, D.C. (District of Columbia)   -2.999583   
48                                     West Virginia   99.738820   
49                                         Wisconsin   -7.330027   
50                                           Wyoming  135.099638   

AI_IMPACTED  Non_AI_Growth  
0                 0.127262  
1                59.226190  
2                -9.089155  
3                36.842105  
4               -12.236745  
5                 4.321175  
6                -9.659091  
7                 7.817460  
8                -1.537571  
9               -14.637446  
10               35.000000  
11               18.636364  
12               -7.741935  
13               -0.118012  
14                4.960664  
15                2.532468  
16                7.008929  
17                9.998474  
18               -2.803030  
19              -11.173274  
20              -12.993233  
21              -15.837507  
22               -8.486483  
23               39.780220  
24               -4.427664  
25               50.000000  
26               55.208333  
27                1.041667  
28               23.541667  
29               -3.758394  
30               18.700397  
31              -19.185690  
32                5.121997  
33                     inf  
34              -13.983037  
35                8.686805  
36               -1.111111  
37               -6.975156  
38               49.196429  
39                4.391724  
40              146.875000  
41               -6.933685  
42              -13.584471  
43               -9.662641  
44               79.166667  
45               -9.500393  
46               -6.715888  
47                5.416195  
48               40.000000  
49              -11.805556  
50                     inf  
# Avoid division by zero when calculating percentage change
pivoted['AI_Growth_Pct'] = (
    pivoted.groupby('STATE_NAME')['AI-Impacted']
    .pct_change()
    .replace([np.inf, -np.inf], np.nan)
    * 100
)

pivoted['Non_AI_Growth_Pct'] = (
    pivoted.groupby('STATE_NAME')['Non-AI-Impacted']
    .pct_change()
    .replace([np.inf, -np.inf], np.nan)
    * 100
)



# Drop rows where either value is still NaN (clean rows only)
pivoted.dropna(subset=['AI_Growth_Pct', 'Non_AI_Growth_Pct'], inplace=True)
avg_growth = (
    pivoted.groupby('STATE_NAME')[['AI_Growth_Pct', 'Non_AI_Growth_Pct']]
    .mean()
    .reset_index()
    .rename(columns={
        'AI_Growth_Pct': 'Avg_AI_Job_Growth',
        'Non_AI_Growth_Pct': 'Avg_Non_AI_Job_Growth'
    })
)


# Top 10 states for AI job growth
top_ai = avg_growth.sort_values(by='Avg_AI_Job_Growth', ascending=False).head(10)
print("🔹 Top 10 States for AI Job Growth:")
print(top_ai)

# Top 10 states for Non-AI job growth
top_non_ai = avg_growth.sort_values(by='Avg_Non_AI_Job_Growth', ascending=False).head(10)
print("\n🔹 Top 10 States for Non-AI Job Growth:")
print(top_non_ai)
🔹 Top 10 States for AI Job Growth:
AI_IMPACTED    STATE_NAME  Avg_AI_Job_Growth  Avg_Non_AI_Job_Growth
40           South Dakota          29.133065             -18.750000
43                   Utah          26.428571             -34.893048
35               Oklahoma          13.716049              41.245791
30             New Mexico          10.673077              15.178571
2                 Arizona           8.845947             -13.677419
25                Montana           7.692308             300.000000
44                Vermont           6.586022              75.000000
5                Colorado           5.324124              -3.195489
0                 Alabama           1.825137              31.250000
6             Connecticut           0.980392               0.000000

🔹 Top 10 States for Non-AI Job Growth:
AI_IMPACTED     STATE_NAME  Avg_AI_Job_Growth  Avg_Non_AI_Job_Growth
25                 Montana           7.692308             300.000000
1                   Alaska         -18.633952             116.666667
44                 Vermont           6.586022              75.000000
48           West Virginia         -51.612903              60.000000
28           New Hampshire         -12.554113              49.166667
35                Oklahoma          13.716049              41.245791
11                   Idaho         -27.073820              37.272727
14                    Iowa         -12.772010              31.250000
0                  Alabama           1.825137              31.250000
10                  Hawaii          -2.547022              25.000000
import plotly.express as px

# AI job growth plot
fig_ai = px.bar(
    top_ai,
    x='STATE_NAME',
    y='Avg_AI_Job_Growth',
    title='Top 10 States by Average AI Job Growth',
    labels={'STATE_NAME': 'State', 'Avg_AI_Job_Growth': 'Avg % AI Job Growth'},
    text='Avg_AI_Job_Growth'
)
fig_ai.update_traces(texttemplate='%{text:.2f}%', textposition='outside')
fig_ai.update_layout(yaxis_title='Average % Growth', xaxis_title='State')
fig_ai.show()

# Non-AI job growth plot
fig_non_ai = px.bar(
    top_non_ai,
    x='STATE_NAME',
    y='Avg_Non_AI_Job_Growth',
    title='Top 10 States by Average Non-AI Job Growth',
    labels={'STATE_NAME': 'State', 'Avg_Non_AI_Job_Growth': 'Avg % Non-AI Job Growth'},
    text='Avg_Non_AI_Job_Growth'
)
fig_non_ai.update_traces(texttemplate='%{text:.2f}%', textposition='outside')
fig_non_ai.update_layout(yaxis_title='Average % Growth', xaxis_title='State')
fig_non_ai.show()
df['AVERAGE_SALARY'] = df[['SALARY_FROM', 'SALARY_TO']].mean(axis=1)
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
Cell In[95], line 1
----> 1 df['AVERAGE_SALARY'] = df[['SALARY_FROM', 'SALARY_TO']].mean(axis=1)

File ~/metad688-job-market-analysis-for-summer-2025-group2/.venv/lib/python3.12/site-packages/pandas/core/frame.py:4108, in DataFrame.__getitem__(self, key)
   4106     if is_iterator(key):
   4107         key = list(key)
-> 4108     indexer = self.columns._get_indexer_strict(key, "columns")[1]
   4110 # take() does not accept boolean indexers
   4111 if getattr(indexer, "dtype", None) == bool:

File ~/metad688-job-market-analysis-for-summer-2025-group2/.venv/lib/python3.12/site-packages/pandas/core/indexes/base.py:6200, in Index._get_indexer_strict(self, key, axis_name)
   6197 else:
   6198     keyarr, indexer, new_indexer = self._reindex_non_unique(keyarr)
-> 6200 self._raise_if_missing(keyarr, indexer, axis_name)
   6202 keyarr = self.take(indexer)
   6203 if isinstance(key, Index):
   6204     # GH 42790 - Preserve name from an Index

File ~/metad688-job-market-analysis-for-summer-2025-group2/.venv/lib/python3.12/site-packages/pandas/core/indexes/base.py:6249, in Index._raise_if_missing(self, key, indexer, axis_name)
   6247 if nmissing:
   6248     if nmissing == len(indexer):
-> 6249         raise KeyError(f"None of [{key}] are in the [{axis_name}]")
   6251     not_found = list(ensure_index(key)[missing_mask.nonzero()[0]].unique())
   6252     raise KeyError(f"{not_found} not in index")

KeyError: "None of [Index(['SALARY_FROM', 'SALARY_TO'], dtype='object')] are in the [columns]"
avg_salary_by_state_type = (
    df.groupby(['STATE_NAME', 'AI_IMPACTED'])['AVERAGE_SALARY']
    .mean()
    .reset_index()
)
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
Cell In[92], line 2
      1 avg_salary_by_state_type = (
----> 2     df.groupby(['STATE_NAME', 'AI_IMPACTED'])['AVERAGE_SALARY']
      3     .mean()
      4     .reset_index()
      5 )

File ~/metad688-job-market-analysis-for-summer-2025-group2/.venv/lib/python3.12/site-packages/pandas/core/groupby/generic.py:1951, in DataFrameGroupBy.__getitem__(self, key)
   1944 if isinstance(key, tuple) and len(key) > 1:
   1945     # if len == 1, then it becomes a SeriesGroupBy and this is actually
   1946     # valid syntax, so don't raise
   1947     raise ValueError(
   1948         "Cannot subset columns with a tuple with more than one element. "
   1949         "Use a list instead."
   1950     )
-> 1951 return super().__getitem__(key)

File ~/metad688-job-market-analysis-for-summer-2025-group2/.venv/lib/python3.12/site-packages/pandas/core/base.py:244, in SelectionMixin.__getitem__(self, key)
    242 else:
    243     if key not in self.obj:
--> 244         raise KeyError(f"Column not found: {key}")
    245     ndim = self.obj[key].ndim
    246     return self._gotitem(key, ndim=ndim)

KeyError: 'Column not found: AVERAGE_SALARY'
import numpy as np

# Step 1: Replace infinite values (result of division by zero) with NaN
pivoted.replace([np.inf, -np.inf], np.nan, inplace=True)

# Step 2: Drop rows with missing percentage growth values
pivoted.dropna(subset=['AI_Growth_Pct', 'Non_AI_Growth_Pct'], inplace=True)

# Step 3: Group by state and calculate average percent growth
avg_growth = pivoted.groupby('STATE_NAME')[['AI_Growth_Pct', 'Non_AI_Growth_Pct']].mean().reset_index()

# Step 4: Sort states by AI and Non-AI growth
ranked = avg_growth.sort_values(by=['AI_Growth_Pct', 'Non_AI_Growth_Pct'], ascending=False)

# Step 5: Display top states for AI and Non-AI job growth
from IPython.display import display
print("Top states by AI job growth:")
display(ranked[['STATE_NAME', 'AI_Growth_Pct']].head(10))

print("\nTop states by Non-AI job growth:")
display(ranked[['STATE_NAME', 'Non_AI_Growth_Pct']].sort_values(by='Non_AI_Growth_Pct', ascending=False).head(10))
Top states by AI job growth:
AI_IMPACTED STATE_NAME AI_Growth_Pct
50 Wyoming 78.532609
25 Montana 45.827710
48 West Virginia 32.985093
40 South Dakota 12.755376
36 Oregon 1.779482
16 Kentucky 1.127738
30 New Mexico 0.865385
43 Utah 0.158730
20 Massachusetts 0.114257
44 Vermont -0.238949

Top states by Non-AI job growth:
AI_IMPACTED STATE_NAME Non_AI_Growth_Pct
33 North Dakota 533.333333
40 South Dakota 220.833333
38 Rhode Island 94.166667
25 Montana 77.777778
50 Wyoming 75.000000
3 Arkansas 65.789474
1 Alaska 53.968254
48 West Virginia 53.333333
11 Idaho 47.070707
30 New Mexico 43.452381
import seaborn as sns
import matplotlib.pyplot as plt

# Step 9: Visualize
plt.figure(figsize=(14, 6))
avg_growth[['AI_Growth', 'Non_AI_Growth']].head(10).plot(kind='bar')
plt.title('Average Monthly Job Growth (Top 10 States) for AI vs. Non-AI Careers')
plt.ylabel('Average Monthly Job Growth')
plt.xticks(rotation=45)
plt.tight_layout()
plt.grid(True)
plt.show()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[4], line 6
      4 # Step 9: Visualize
      5 plt.figure(figsize=(14, 6))
----> 6 avg_growth[['AI_Growth', 'Non_AI_Growth']].head(10).plot(kind='bar')
      7 plt.title('Average Monthly Job Growth (Top 10 States) for AI vs. Non-AI Careers')
      8 plt.ylabel('Average Monthly Job Growth')

NameError: name 'avg_growth' is not defined
<Figure size 1400x600 with 0 Axes>
# Filter for Boston, MA and Austin, TX
selected_state = ['California', 'Florida', 'Massachusetts', 'Texas', 'New York']
filtered_df = df[df['STATE_NAME'].isin(selected_state)]

# Further filter for NAICS_2022_6 = 518210 and show relevant columns
final_df = filtered_df[filtered_df['LOT_SPECIALIZED_OCCUPATION_NAME'].str.contains('analyst', case=False, na=False)]
final_df[['STATE_NAME', 'NAICS2_NAME', 'NAICS_2022_6', 'LOT_SPECIALIZED_OCCUPATION_NAME']].head(100)
STATE_NAME NAICS2_NAME NAICS_2022_6 LOT_SPECIALIZED_OCCUPATION_NAME
2 Texas Finance and Insurance 524291.0 Data Analyst
4 California Unclassified Industry 999999.0 Oracle Consultant / Analyst
9 New York Professional, Scientific, and Technical Services 541511.0 Data Analyst
10 California Wholesale Trade 423830.0 Data Analyst
15 Massachusetts Educational Services 611310.0 Data Analyst
... ... ... ... ...
294 Florida Educational Services 611310.0 SAP Analyst / Admin
295 California Finance and Insurance 524114.0 Data Analyst
296 New York Unclassified Industry 999999.0 General ERP Analyst / Consultant
297 Texas Professional, Scientific, and Technical Services 541611.0 SAP Analyst / Admin
299 Texas Professional, Scientific, and Technical Services 541511.0 General ERP Analyst / Consultant

100 rows × 4 columns

# Filter for Boston, MA and Austin, TX
selected_state = ['California', 'Florida', 'Massachusetts', 'Texas', 'New York']
filtered_df = df[df['STATE_NAME'].isin(selected_state)]
# Further filter for NAICS_2022_6 = 518210 and show relevant columns
final_df = filtered_df[filtered_df['LOT_SPECIALIZED_OCCUPATION_NAME'].str.contains('analyst', case=False, na=False)]
final_df[['STATE_NAME', 'NAICS2_NAME', 'NAICS_2022_6', 'LOT_SPECIALIZED_OCCUPATION_NAME']].head(100)
STATE_NAME NAICS2_NAME NAICS_2022_6 LOT_SPECIALIZED_OCCUPATION_NAME
2 Texas Finance and Insurance 524291.0 Data Analyst
4 California Unclassified Industry 999999.0 Oracle Consultant / Analyst
9 New York Professional, Scientific, and Technical Services 541511.0 Data Analyst
10 California Wholesale Trade 423830.0 Data Analyst
15 Massachusetts Educational Services 611310.0 Data Analyst
... ... ... ... ...
294 Florida Educational Services 611310.0 SAP Analyst / Admin
295 California Finance and Insurance 524114.0 Data Analyst
296 New York Unclassified Industry 999999.0 General ERP Analyst / Consultant
297 Texas Professional, Scientific, and Technical Services 541611.0 SAP Analyst / Admin
299 Texas Professional, Scientific, and Technical Services 541511.0 General ERP Analyst / Consultant

100 rows × 4 columns

import matplotlib.pyplot as plt
import numpy as np

# Group by STATE_NAME and count jobs for NAICS_2022_6 = 518210
state_counts_jobs = final_df.groupby('STATE_NAME').size().reset_index(name='job_count')

# Sort state_counts_jobs from greatest to least by job_count
state_counts_jobs_sorted = state_counts_jobs.sort_values(by='job_count', ascending=False)

# Plot column chart
plt.figure(figsize=(8, 5))
colors = plt.cm.coolwarm(np.linspace(0, 1, len(state_counts_jobs_sorted)))
plt.bar(state_counts_jobs_sorted['STATE_NAME'], state_counts_jobs_sorted['job_count'], color=colors)
plt.xlabel('State')
plt.ylabel('Number of Jobs')
plt.title('Tech Jobs by State (Job Title Contains "Analyst")')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()